

# to process data
library(tidyverse)
library(lubridate)

# to manipulate text data
library(quanteda)

# to estimate keyword assisted topic models
library(keyATM)



# load IMF conditionality data for the 74 countries of interest
load("imf_conditionality.RData")


# conditions reference several national oil companies (NOC)
# we don't care about the individual NOC as much as about the fact that it's a NOC
# so replace each mention of a NOC with the abbreviation "noc"  
full_imf_sample <- full_imf_sample %>%
  mutate(text = gsub("extrabudgetary", "extra-budgetary", text),
         text = gsub("Sonelgaz|Sonangol|SOCAR|AzeriGas|Azerigas|Azerigaz|SONABEL|SONABHY|SNH|SONARA|PETROCA|SNPC|SOGARA|
                     PETROCI|PETROCI.|SOMAGAZ|SONIDEP|NNPC|Gazprom|Ukrgazprom|Ukrgazprom.|OTP|Naftogaz|Naftogas|ofNaftogas|PDVSA","noc", text))

# for the main analysis, only keep binding conditions
# note: there are a total of 435 agreements, but 33 of these agreements consist exclusively of non-binding conditions
# e.g. Afghanistan 2016-2019 (id 745) consists of 55 structural benchmarks (SB), which (according to Copelovitch 2010 and others) are not binding
# once we limit the sample to non-binding conditions, we lose these 33 agreements
# hence the total number of agreements mentioned in the paper is 402
binding <- full_imf_sample %>%
  filter(binding==1)


# preparing time index
# divide by a decade
# timestamp should start with 1 (the variable "Period")
binding <- binding %>% 
  mutate(year = year(begin),
         iso3c = as.factor(iso3c)) %>%
  filter(year>1977 & year<2020) %>%
  mutate(period = (year - 1978) %/% 10 + 1) %>%
  mutate(period = as.integer(period)) %>%
  arrange(period)


# generate corpus
key_corpus <- corpus(binding, text_field = "text")

# pre-processing decisions can be arbitrary and misleading (Denny and Spirling 2018)
# so we deliberately do not stem words and do not remove infrequent terms 
data_tokens <- tokens(key_corpus,
                       remove_numbers = TRUE,
                       remove_punct = TRUE,
                       remove_symbols = TRUE,
                       remove_separators = TRUE,
                       remove_url = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(c(stopwords("english")))

# create a document-feature matrix (a dfm object) from a token object
data_dfm <- dfm(data_tokens)


# to investigate how the prevalence of topics changes over time, estimate a dynamic keyATM
# the following is based on this vignette: https://keyatm.github.io/keyATM/articles/pkgdown_files/keyATM_dynamic.html

# select time variables
vars_period <- binding %>% 
  select(year, period)

# read data for keyATM
keyATM_docs <- keyATM_read(texts = data_dfm)
tstat_freq <- textstat_frequency(data_dfm)

# make a list of natural resource keywords
natural_resource_keywords <- list(`Natural Resources`       = c("natural", "extractive", "oil","petroleum",
                                                                "crude", "petroleum", "gas", "gasoline", "diesel","electricity",
                                                                "fuel", "fuels","energy", "refinery", "hydrocarbon", "mineral",
                                                                "mining", "mine", "copper", "gold", "diamond",
                                                                "iron", "steel", "phosphate", "eiti","noc"))


# fit a dynamic keyATM using the list of keywords specified above
# model includes 13 extra topics because that's the number of topics Kentikelenis et al identify
out <-  keyATM(docs               = keyATM_docs,
                no_keyword_topics = 13,
                keywords          = natural_resource_keywords,
                model             = "dynamic",
                model_settings    = list(time_index = vars_period$period,
                                         num_states = 4),
                options           = list(seed = 250))

# what are the top words for each topic? this info is used to generate Table 3
top_words(out)

# how does the prevalence of the natural resource topic change over time? this info is used to generate Figure 4 
plot_timetrend(out, time_index_label = binding$year, xlab = "Year", width = 5, how_topic=1)


# add topic proportions to dataset
theta_values <- as.data.frame(out$theta)
results_binding <- cbind(binding,theta_values)

results_binding <- results_binding %>%
  rename(topic1_natres = `1_Natural Resources`)


save.image(file="workspace_topic_models.Rdata")

